Loading Packages & Data

library(tidyverse)
library(dplyr)
library(gridExtra)
library(kableExtra)
library(ggthemr)
library(leaflet)
library(leaflet.extras)
library(wordcloud)
library(tm)
library(NLP)
library(corrplot)
# Select the "dust" ggthemr theme for the plots in this report.
ggthemr("dust")
# Load the listings; empty strings are read as NA. TRUE/FALSE are spelled out
# because T and F are ordinary variables that can be reassigned.
airbnb <- read.csv("AB_NYC_2019.csv", stringsAsFactors = FALSE, na.strings = "")

Data exploration

Structure and features

# Render the first rows of the data as a styled, horizontally scrollable table.
airbnb %>%
  head() %>%
  kable() %>%
  kable_styling() %>%
  scroll_box(width = "910px")
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
3647 THE VILLAGE OF HARLEM….NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NA NA 1 365
3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
5099 Large Cozy 1 BR Apartment In Midtown East 7322 Chris Manhattan Murray Hill 40.74767 -73.97500 Entire home/apt 200 3 74 2019-06-22 0.59 1 129
# Show each column's type together with its first few values.
str(airbnb)
## 'data.frame':    48895 obs. of  16 variables:
##  $ id                            : int  2539 2595 3647 3831 5022 5099 5121 5178 5203 5238 ...
##  $ name                          : chr  "Clean & quiet apt home by the park" "Skylit Midtown Castle" "THE VILLAGE OF HARLEM....NEW YORK !" "Cozy Entire Floor of Brownstone" ...
##  $ host_id                       : int  2787 2845 4632 4869 7192 7322 7356 8967 7490 7549 ...
##  $ host_name                     : chr  "John" "Jennifer" "Elisabeth" "LisaRoxanne" ...
##  $ neighbourhood_group           : chr  "Brooklyn" "Manhattan" "Manhattan" "Brooklyn" ...
##  $ neighbourhood                 : chr  "Kensington" "Midtown" "Harlem" "Clinton Hill" ...
##  $ latitude                      : num  40.6 40.8 40.8 40.7 40.8 ...
##  $ longitude                     : num  -74 -74 -73.9 -74 -73.9 ...
##  $ room_type                     : chr  "Private room" "Entire home/apt" "Private room" "Entire home/apt" ...
##  $ price                         : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights                : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ number_of_reviews             : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ last_review                   : chr  "2018-10-19" "2019-05-21" NA "2019-07-05" ...
##  $ reviews_per_month             : num  0.21 0.38 NA 4.64 0.1 0.59 0.4 3.47 0.99 1.33 ...
##  $ calculated_host_listings_count: int  6 2 1 1 1 1 1 1 1 4 ...
##  $ availability_365              : int  365 355 365 194 0 129 0 220 0 188 ...
# Print per-column summary statistics, including NA counts where present.
summary(airbnb)
##        id               name              host_id           host_name        
##  Min.   :    2539   Length:48895       Min.   :     2438   Length:48895      
##  1st Qu.: 9471945   Class :character   1st Qu.:  7822033   Class :character  
##  Median :19677284   Mode  :character   Median : 30793816   Mode  :character  
##  Mean   :19017143                      Mean   : 67620011                     
##  3rd Qu.:29152178                      3rd Qu.:107434423                     
##  Max.   :36487245                      Max.   :274321313                     
##                                                                              
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:48895        Length:48895       Min.   :40.50   Min.   :-74.24  
##  Class :character    Class :character   1st Qu.:40.69   1st Qu.:-73.98  
##  Mode  :character    Mode  :character   Median :40.72   Median :-73.96  
##                                         Mean   :40.73   Mean   :-73.95  
##                                         3rd Qu.:40.76   3rd Qu.:-73.94  
##                                         Max.   :40.91   Max.   :-73.71  
##                                                                         
##   room_type             price         minimum_nights    number_of_reviews
##  Length:48895       Min.   :    0.0   Min.   :   1.00   Min.   :  0.00   
##  Class :character   1st Qu.:   69.0   1st Qu.:   1.00   1st Qu.:  1.00   
##  Mode  :character   Median :  106.0   Median :   3.00   Median :  5.00   
##                     Mean   :  152.7   Mean   :   7.03   Mean   : 23.27   
##                     3rd Qu.:  175.0   3rd Qu.:   5.00   3rd Qu.: 24.00   
##                     Max.   :10000.0   Max.   :1250.00   Max.   :629.00   
##                                                                          
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:48895       Min.   : 0.010    Min.   :  1.000               
##  Class :character   1st Qu.: 0.190    1st Qu.:  1.000               
##  Mode  :character   Median : 0.720    Median :  1.000               
##                     Mean   : 1.373    Mean   :  7.144               
##                     3rd Qu.: 2.020    3rd Qu.:  2.000               
##                     Max.   :58.500    Max.   :327.000               
##                     NA's   :10052                                   
##  availability_365
##  Min.   :  0.0   
##  1st Qu.:  0.0   
##  Median : 45.0   
##  Mean   :112.8   
##  3rd Qu.:227.0   
##  Max.   :365.0   
## 
# Remove the listing/host identifier columns from the data frame.
airbnb[c("id", "host_id", "host_name")] <- NULL

Missing Data

# Count missing values per column and keep only the columns that have any.
# This is vectorised over all columns, so it works for any number of columns
# with NAs (the previous version patched in names for exactly three rows).
#   Var1 - column name
#   Freq - number of NA values in that column
#   perc - share of rows that are NA (a proportion between 0 and 1)
missing_airbnb <- data.frame(
  Var1 = colnames(airbnb),
  Freq = as.integer(colSums(is.na(airbnb))),
  stringsAsFactors = FALSE
)
missing_airbnb <- missing_airbnb[missing_airbnb$Freq > 0, ]
rownames(missing_airbnb) <- NULL
missing_airbnb$perc <- missing_airbnb$Freq / nrow(airbnb)

# Horizontal bar chart of the share of missing values per column.
# aes() refers to columns by bare name; `data$col` inside aes() bypasses the
# data argument and is discouraged by ggplot2.
ggplot(missing_airbnb, aes(x = Var1, y = perc)) +
  geom_bar(stat = "identity", color = "white", size = 0.3) +
  coord_flip() +
  ylim(0, 1) +
  ylab("Percentage missing") +
  xlab("Column name") +
  ggtitle("Missing Data", subtitle = "name has less than 0.001 percentage missing")

# Keep the listing names in their own vector (used later for the word cloud),
# then remove the three columns that contain missing values.
airbnb_name <- airbnb[["name"]]
airbnb[c("name", "last_review", "reviews_per_month")] <- NULL

Both “reviews_per_month” and “last_review” are missing in about 20.56% of the rows, which suggests that these two variables are related. Because so much of the data is missing, we do not use these two columns. Although “name” has less than 0.001 (as a proportion) missing, we cannot fill those values in, so we set the column aside in airbnb_name and remove it from the data frame.

Data Visualization

Location & Price

Location and price are the most important factors for people choosing a homestay.

Location

In the data, neighbourhood_group means location and neighbourhood means area

# neighbourhood_group: convert to a factor so summary() counts listings per level
airbnb[["neighbourhood_group"]] <- as.factor(airbnb[["neighbourhood_group"]])
summary(airbnb[["neighbourhood_group"]])
##         Bronx      Brooklyn     Manhattan        Queens Staten Island 
##          1091         20104         21661          5666           373
# Number of listings per neighbourhood group, as a two-column data frame.
NGlocation <- as.data.frame(table(airbnb$neighbourhood_group))
colnames(NGlocation) <- c("Location", "Freq")

# Bar chart of listings per location. Columns are referenced by bare name in
# aes(); `data$col` inside aes() is discouraged by ggplot2.
ggplot(NGlocation, aes(x = Location, y = Freq, fill = Location)) +
  geom_bar(stat = "identity", color = "white", size = 0.3) +
  theme(legend.position = "none") +
  xlab("Location") +
  ylab("Number of Houses")

As we can see, Brooklyn and Manhattan are the two most popular places where people rent out their homes.

Area

# Convert neighbourhood to a factor and show the first 20 entries of its summary.
airbnb[["neighbourhood"]] <- as.factor(airbnb[["neighbourhood"]])
airbnb[["neighbourhood"]] %>%
  summary() %>%
  head(20)
##       Williamsburg Bedford-Stuyvesant             Harlem           Bushwick 
##               3920               3714               2658               2465 
##    Upper West Side     Hell's Kitchen       East Village    Upper East Side 
##               1971               1958               1853               1798 
##      Crown Heights            Midtown        East Harlem         Greenpoint 
##               1564               1545               1117               1115 
##            Chelsea    Lower East Side            Astoria Washington Heights 
##               1113                911                900                899 
##       West Village Financial District           Flatbush       Clinton Hill 
##                768                744                621                572
# There are too many neighbourhoods to plot them all, so keep only those with
# more than 1000 listings and sort them by listing count.
NArea <- as.data.frame(table(airbnb$neighbourhood))
colnames(NArea) <- c("Area", "Freq")
NArea <- filter(NArea, Freq > 1000)
NArea <- arrange(NArea, Freq)

# A discrete axis is ordered by factor level, not by row order, so arrange()
# alone does not sort the bars. reorder() sets the level order from Freq.
ggplot(NArea, aes(x = reorder(Area, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  xlab("Area") +
  ylab("Number of Houses") +
  coord_flip()

Latitude & Longitude

# Coordinates of every listing, used as the map's data.
nycLatLong <- data.frame(lat = airbnb$latitude, lng = airbnb$longitude)

# Interactive heat map of listing density with switchable base maps.
# Every tile layer gets a `group` that matches an entry in `baseGroups`;
# a name in the layer control with no matching group controls nothing, and an
# ungrouped tile layer stays on the map whatever base map is selected.
nycLatLong %>%
  leaflet() %>%
  addTiles(group = "OSM (default)") %>%
  addProviderTiles("HikeBike.HikeBike", group = "HikeBike") %>%
  addProviderTiles(providers$Stamen.Toner, group = "Toner") %>%
  addProviderTiles(providers$CartoDB.DarkMatter, group = "DarkMatter") %>%
  addProviderTiles(providers$Esri.WorldImagery, group = "ESRI-Sat") %>%
  # Formulas are evaluated against the data passed to leaflet() (nycLatLong).
  addWebGLHeatmap(lng = ~lng, lat = ~lat, size = 500) %>%
  addLayersControl(
    baseGroups = c(
      "OSM (default)",
      "HikeBike",
      "Toner",
      "DarkMatter",
      "ESRI-Sat"
    ),
    options = layersControlOptions(collapsed = FALSE)
  )

Price

# Frequency table of the distinct prices (not used by the plot below).
AirPrice <- as.data.frame(table(airbnb$price))
colnames(AirPrice) <- c("Price", "Freq")

# Density of price on the raw scale. The column is referenced by bare name in
# aes(); `data$col` inside aes() is discouraged by ggplot2.
ggplot(airbnb, aes(x = price)) +
  geom_density() +
  ggtitle("Distribution of price",
          subtitle = "The distribution is very skewed") +
  xlab("Price")

The distribution is heavily right-skewed: the graph shows that most prices are under $2500, so we switch the x-axis to a log scale.

# Frequency table of the distinct prices (not used by the plot below).
AirPrice <- as.data.frame(table(airbnb$price))
colnames(AirPrice) <- c("Price", "Freq")

# Density of price on a log10 x-axis, with a vertical line at the mean price.
# The subtitle is computed from the data so that it always agrees with the
# line; the previous hard-coded "$142.31" did not match mean(airbnb$price).
ggplot(airbnb, aes(x = price)) +
  geom_density() +
  ggtitle("Distribution of price",
          subtitle = paste0("Mean price = $", round(mean(airbnb$price), 2))) +
  xlab("Price") +
  scale_x_log10() +
  geom_vline(xintercept = round(mean(airbnb$price), 2), size = 1)

Is there any relation between price and location?

# Mean price per neighbourhood group, rounded to two decimals.
airbnb_nh <- summarise(
  group_by(airbnb, neighbourhood_group),
  price = round(mean(price), 2)
)

# One price density panel per neighbourhood group on a log10 x-axis, each
# annotated with a vertical line and a text label at that group's mean price.
ggplot(airbnb, aes(x = price)) +
  geom_density() +
  labs(
    title = "Transformed distribution of price\n by neighbourhood groups",
    subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")
  ) +
  geom_vline(data = airbnb_nh, aes(xintercept = price), size = 1) +
  geom_text(
    data = airbnb_nh,
    aes(x = price + 1400, label = paste("Mean  = ", price)),
    y = 1.5,
    color = "darkgreen",
    size = 4
  ) +
  scale_x_log10() +
  facet_wrap(vars(neighbourhood_group))

The five locations have similar distributions, but Manhattan has the highest prices of the five.

Many factors can affect “Number of Reviews”

Name

# Word cloud of the words used in listing names. A few listings have no name
# (NA); drop them so that missing values are not passed to the text
# processing as if they were words.
wordcloud(airbnb_name[!is.na(airbnb_name)])

In their listing names, hosts like to use words such as “private”, “manhattan”, “brooklyn”, “charming”, etc.

Room Type

# Turn room_type into a factor and tabulate the number of listings per type.
airbnb[["room_type"]] <- factor(airbnb[["room_type"]])
RoomType <- setNames(
  as.data.frame(table(airbnb[["room_type"]])),
  c("RoomType", "Freq")
)

str(RoomType)
## 'data.frame':    3 obs. of  2 variables:
##  $ RoomType: Factor w/ 3 levels "Entire home/apt",..: 1 2 3
##  $ Freq    : int  25409 22326 1160
# Donut chart of listings per room type: a single stacked bar at x = 2 drawn
# in polar coordinates, with the x-limits leaving an empty centre.
ggplot(RoomType, aes(x = 2, y = Freq, fill = RoomType)) +
  geom_col(color = "white", width = 0.8) +
  scale_fill_brewer(palette = 4) +
  coord_polar(theta = "y") +
  xlim(0.5, 2.5) +
  theme_void()

Most hosts rent out an “Entire home/apt” or a “Private room”.

# Box plots of the number of reviews per room type on a log10 y-axis; the
# dashed purple line marks the overall mean number of reviews.
# Columns are referenced by bare name in aes(); `data$col` inside aes() is
# discouraged by ggplot2.
# NOTE(review): number_of_reviews contains zeros and log10(0) is -Inf, so
# listings without reviews are expected to be left out of the boxes.
ggplot(airbnb, aes(x = room_type, y = number_of_reviews, fill = room_type)) +
  geom_boxplot() +
  geom_hline(yintercept = mean(airbnb$number_of_reviews), color = "purple", linetype = 2) +
  scale_y_log10() +
  theme(legend.position = "none") +
  xlab("Room Type") +
  ylab("Number of Reviews")

Each room type has a similar number of reviews.

Multiple Regression

# Select the response (number_of_reviews) and the candidate predictors.
airMutiReg <- airbnb[c(
  "number_of_reviews",
  "neighbourhood_group",
  "room_type",
  "price",
  "minimum_nights",
  "availability_365"
)]
airMutiReg[["neighbourhood_group"]] <- factor(airMutiReg[["neighbourhood_group"]])
str(airMutiReg)
## 'data.frame':    48895 obs. of  6 variables:
##  $ number_of_reviews  : int  9 45 0 270 9 74 49 430 118 160 ...
##  $ neighbourhood_group: Factor w/ 5 levels "Bronx","Brooklyn",..: 2 3 3 2 3 3 2 3 3 3 ...
##  $ room_type          : Factor w/ 3 levels "Entire home/apt",..: 2 1 2 1 1 1 2 2 2 1 ...
##  $ price              : int  149 225 150 89 80 200 60 79 79 150 ...
##  $ minimum_nights     : int  1 1 3 1 10 3 45 2 2 1 ...
##  $ availability_365   : int  365 355 365 194 0 129 0 220 0 188 ...
# One-hot encode the two factors (one indicator column per level, no
# intercept) and append them to the numeric columns.
airMROneHot <- cbind(
  airMutiReg[vapply(airMutiReg, is.numeric, logical(1))],
  as.data.frame(model.matrix(~ room_type - 1, airMutiReg)),
  as.data.frame(model.matrix(~ neighbourhood_group - 1, airMutiReg))
)

# Pairwise correlations between all numeric and indicator columns.
cor(airMROneHot)
##                                  number_of_reviews       price minimum_nights
## number_of_reviews                      1.000000000 -0.04795423   -0.080116068
## price                                 -0.047954227  1.00000000    0.042799334
## minimum_nights                        -0.080116068  0.04279933    1.000000000
## availability_365                       0.172027581  0.08182883    0.144303063
## room_typeEntire home/apt              -0.010087231  0.25585665    0.074899803
## room_typePrivate room                  0.017253226 -0.24024642   -0.073836539
## room_typeShared room                  -0.023354904 -0.05361282   -0.004217946
## neighbourhood_groupBronx               0.009257903 -0.04102998   -0.018185701
## neighbourhood_groupBrooklyn            0.017413636 -0.09860261   -0.039658304
## neighbourhood_groupManhattan          -0.045820056  0.16397551    0.067362031
## neighbourhood_groupQueens              0.035966538 -0.08020500   -0.032629026
## neighbourhood_groupStaten Island       0.015088160 -0.01383994   -0.009399622
##                                  availability_365 room_typeEntire home/apt
## number_of_reviews                     0.172027581             -0.010087231
## price                                 0.081828827              0.255856647
## minimum_nights                        0.144303063              0.074899803
## availability_365                      1.000000000             -0.006804234
## room_typeEntire home/apt             -0.006804234              1.000000000
## room_typePrivate room                -0.010985839             -0.953470178
## room_typeShared room                  0.058293890             -0.162143592
## neighbourhood_groupBronx              0.060806165             -0.052092280
## neighbourhood_groupBrooklyn          -0.079670621             -0.073905066
## neighbourhood_groupManhattan         -0.005433606              0.160081940
## neighbourhood_groupQueens             0.087112501             -0.108505293
## neighbourhood_groupStaten Island      0.057884814             -0.008390983
##                                  room_typePrivate room room_typeShared room
## number_of_reviews                          0.017253226        -0.0233549043
## price                                     -0.240246424        -0.0536128152
## minimum_nights                            -0.073836539        -0.0042179463
## availability_365                          -0.010985839         0.0582938901
## room_typeEntire home/apt                  -0.953470178        -0.1621435919
## room_typePrivate room                      1.000000000        -0.1428987359
## room_typeShared room                      -0.142898736         1.0000000000
## neighbourhood_groupBronx                   0.042765002         0.0310413460
## neighbourhood_groupBrooklyn                0.079464133        -0.0174667259
## neighbourhood_groupManhattan              -0.157762374        -0.0091690009
## neighbourhood_groupQueens                  0.100676348         0.0266930678
## neighbourhood_groupStaten Island           0.008345062         0.0002329651
##                                  neighbourhood_groupBronx
## number_of_reviews                             0.009257903
## price                                        -0.041029979
## minimum_nights                               -0.018185701
## availability_365                              0.060806165
## room_typeEntire home/apt                     -0.052092280
## room_typePrivate room                         0.042765002
## room_typeShared room                          0.031041346
## neighbourhood_groupBronx                      1.000000000
## neighbourhood_groupBrooklyn                  -0.126238876
## neighbourhood_groupManhattan                 -0.134729839
## neighbourhood_groupQueens                    -0.054692899
## neighbourhood_groupStaten Island             -0.013245408
##                                  neighbourhood_groupBrooklyn
## number_of_reviews                                 0.01741364
## price                                            -0.09860261
## minimum_nights                                   -0.03965830
## availability_365                                 -0.07967062
## room_typeEntire home/apt                         -0.07390507
## room_typePrivate room                             0.07946413
## room_typeShared room                             -0.01746673
## neighbourhood_groupBronx                         -0.12623888
## neighbourhood_groupBrooklyn                       1.00000000
## neighbourhood_groupManhattan                     -0.74524041
## neighbourhood_groupQueens                        -0.30252659
## neighbourhood_groupStaten Island                 -0.07326523
##                                  neighbourhood_groupManhattan
## number_of_reviews                                -0.045820056
## price                                             0.163975505
## minimum_nights                                    0.067362031
## availability_365                                 -0.005433606
## room_typeEntire home/apt                          0.160081940
## room_typePrivate room                            -0.157762374
## room_typeShared room                             -0.009169001
## neighbourhood_groupBronx                         -0.134729839
## neighbourhood_groupBrooklyn                      -0.745240413
## neighbourhood_groupManhattan                      1.000000000
## neighbourhood_groupQueens                        -0.322874856
## neighbourhood_groupStaten Island                 -0.078193131
##                                  neighbourhood_groupQueens
## number_of_reviews                               0.03596654
## price                                          -0.08020500
## minimum_nights                                 -0.03262903
## availability_365                                0.08711250
## room_typeEntire home/apt                       -0.10850529
## room_typePrivate room                           0.10067635
## room_typeShared room                            0.02669307
## neighbourhood_groupBronx                       -0.05469290
## neighbourhood_groupBrooklyn                    -0.30252659
## neighbourhood_groupManhattan                   -0.32287486
## neighbourhood_groupQueens                       1.00000000
## neighbourhood_groupStaten Island               -0.03174211
##                                  neighbourhood_groupStaten Island
## number_of_reviews                                    0.0150881597
## price                                               -0.0138399434
## minimum_nights                                      -0.0093996218
## availability_365                                     0.0578848136
## room_typeEntire home/apt                            -0.0083909829
## room_typePrivate room                                0.0083450616
## room_typeShared room                                 0.0002329651
## neighbourhood_groupBronx                            -0.0132454075
## neighbourhood_groupBrooklyn                         -0.0732652325
## neighbourhood_groupManhattan                        -0.0781931311
## neighbourhood_groupQueens                           -0.0317421076
## neighbourhood_groupStaten Island                     1.0000000000
# Plot the correlation matrix, with variables ordered by corrplot's "AOE" method.
corrplot(cor(airMROneHot), order = "AOE")

In other words, these variables are only weakly correlated with number_of_reviews (the strongest, availability_365, is about 0.17).